# Library imports: FITS image access, tabular data handling, curve fitting,
# plotting, and scikit-learn modelling utilities.
import astropy
#import plotly.graph_objects as go
from astropy.io import fits
# Tabular I/O for the original catalogue file
import pandas as pd
import pylab as plb
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib import cm
from scipy.optimize import curve_fit
# BUG FIX: `scipy.asarray` / `scipy.exp` were NumPy re-exports removed from
# modern SciPy; import the NumPy originals under the same aliases.
from numpy import asarray as ar, exp
import numpy as np
# BUG FIX: `sklearn.utils.testing` was removed in scikit-learn 0.24; the
# helper now lives in the private `sklearn.utils._testing` module.
try:
    from sklearn.utils._testing import ignore_warnings
except ImportError:  # older scikit-learn releases
    from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import math
import seaborn as sns
# Global matplotlib styling applied to every figure produced by this script.
plt.style.use('fivethirtyeight')
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
#plt.rcParams['legend.fontsize'] = 12
plt.rcParams['figure.titlesize'] = 12
plt.rcParams['image.cmap'] = 'jet'
plt.rcParams['image.interpolation'] = 'none'
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.markersize'] = 8
plt.rcParams["axes.grid"] = False
# Load the photometric catalogue (whitespace-separated columns).
# BUG FIX: raw string avoids the invalid "\s" escape-sequence warning
# (an error on recent Python versions).
data = pd.read_csv('star.txt', sep=r'\s+')
# Colour index (F606W - F814W) and F606W magnitude for the CMD.
x = np.array(data.F606W - data.F814W)
y = np.array(data.F606W)
data.head()
# Drop the target column (Sharp) and the identifier before running PCA.
notar = data.drop(columns=['Sharp', '#ID'])
pca = PCA(n_components=3)
pca = pca.fit(notar)
pca_data = pd.DataFrame(pca.transform(notar))
pca_data = pca_data.rename(columns={0: 'FirstComponent', 1: 'SecondComponent', 2: 'ThirdComponent'})
pca_data.tail()
notar.corr()
pca_data.corr()
COL_NAMES = pca_data.columns.tolist()
# 3x3 grid of pairwise plots between the three principal components.
# The very first panel is drawn as a joined line (as in the original);
# all remaining panels use single-pixel markers.
k = 1
for i in range(3):
    col = COL_NAMES[i]
    for j in range(3):
        plt.subplot(3, 3, k)
        plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)
        if k == 1:
            plt.plot(pca_data[col], pca_data[COL_NAMES[j]], color='k')
        else:
            plt.plot(pca_data[col], pca_data[COL_NAMES[j]], ',', color='k')
        plt.xlabel(COL_NAMES[i])
        plt.ylabel(COL_NAMES[j])
        k = k + 1
from sklearn.feature_selection import mutual_info_regression as mi
# Mutual information between each raw catalogue column (skipping the ID in
# front and Sharp at the end) and the Sharp target.
COL_NAMES = data.columns.tolist()
MI = []
for col in COL_NAMES[1:-1]:
    MI.append(mi(np.array(data[col].tolist()).reshape(-1, 1), np.array(data.Sharp)))
    print('Mutual information computed between ' + str(col) + ' and Sharp')
NEW_MI = [row[0] for row in MI]
MI_data = pd.DataFrame({'Column': COL_NAMES[1:-1], 'Mutual Information': NEW_MI})
MI_data.sort_values(by='Mutual Information', ascending=False).head(3)
# Same measurement for the three PCA components.
PCA_COL_NAMES = pca_data.columns.tolist()
MI = []
for col in PCA_COL_NAMES:
    MI.append(mi(np.array(pca_data[col].tolist()).reshape(-1, 1), np.array(data.Sharp)))
    print('Mutual information computed between ' + str(col) + ' and Sharp')
PCA_MI = [row[0] for row in MI]
PCA_MI_data = pd.DataFrame({'Column': PCA_COL_NAMES, 'Mutual Information': PCA_MI})
PCA_MI_data
# Re-run PCA excluding the positional columns (X, Y) as well.
notar = data.drop(columns=['Sharp', '#ID', 'X', 'Y'])
notar.head()
pca = PCA(n_components=3)
pca = pca.fit(notar)
pca_data = pd.DataFrame(pca.transform(notar))
pca_data = pca_data.rename(columns={0: 'FirstComponent', 1: 'SecondComponent', 2: 'ThirdComponent'})
COL_NAMES = pca_data.columns.tolist()
# Corner-style plot: KDE of each component on the diagonal panels
# (1, 5, 9), pixel-marker scatter plots everywhere else.
k = 1
q = 0
for i in range(3):
    col = COL_NAMES[i]
    for j in range(3):
        plt.subplot(3, 3, k)
        plt.subplots_adjust(left=0.025, bottom=0.1, right=0.9, top=1.5, wspace=0.2, hspace=0.7)
        if k in (1, 5, 9):
            sns.kdeplot(pca_data[COL_NAMES[i]], color='darkorange')
            #g._legend.remove()
            plt.grid(True)
            plt.xlabel('Values')
            plt.legend([], [], frameon=False)
            plt.xlabel(COL_NAMES[i])
            plt.ylabel('Distribution')
            #plt.ylabel(COL_NAMES[j])
        else:
            plt.plot(pca_data[col], pca_data[COL_NAMES[j]], ',', color='k')
            plt.xlabel(COL_NAMES[i])
            plt.ylabel(COL_NAMES[j])
            plt.grid(True)
        k = k + 1
# Left panel: PCA third vs first component; right panel: the observed
# colour-magnitude diagram (y-axis inverted so bright stars sit on top).
plt.subplot(1,2,1)
plt.plot(pca_data['ThirdComponent'],pca_data['FirstComponent'],',',color='gold')
plt.xlabel('PCA Third Component ')
plt.ylabel('PCA First Component')
plt.grid(True)
plt.subplot(1,2,2)
plt.ylim(30.5,12.5)
plt.ylabel('814 nm Flux')
plt.xlabel('Stellar Color')
plt.plot(np.array(data['F814W']-data['F606W']),data.F814W,',',color='gold')
plt.grid(True)
# Overlay: sign-flipped first PCA component against F814W (black), plus a
# hand-tuned linear reconstruction from F606W and the error column (red).
plt.plot(-pca_data.FirstComponent,data.F814W,',',color='k')
plt.plot((data.F606W-26.5)*1.3+5*data.error,data.F814W,',',color='red')
##plt.plot((data.F606W-26.2)*1.3-data.error,data.F814W,',',color='purple')
#plt.plot(-pca_data.FirstComponent,data.F606W,',',color='red')
plt.grid(True)
from sklearn.metrics import mean_squared_error
# Brute-force grid search for (a, b, c) such that
#   (F606W + b) * a + c * error
# reconstructs the (sign-flipped) first PCA component, then two greedy
# refinement passes adding an F814W term and an 'error.1' term.
A = np.arange(1, 3, 0.1)
B = np.arange(-30.5, -22.5, 0.5)
C = np.arange(-15, 15, 1)
orig = -pca_data.FirstComponent
# NOTE(review): this is abs() of the negated maximum, not the maximum
# absolute value -- kept exactly as the original normalisation.
max_pca = np.abs(-pca_data.FirstComponent.max())
RMSE = []
TRIPLET = []
for a in A:
    for b in B:
        for c in C:
            recons = (data.F606W + b) * a + c * data.error
            RMSE.append(np.sqrt(mean_squared_error(recons, orig)))
            TRIPLET.append([a, b, c])
#np.array(RMSE).argmin()
# Locate the winning triplet once (the original recomputed argmin over the
# full RMSE list three times).
best_idx = int(np.argmin(RMSE))
a_opt, b_opt, c_opt = TRIPLET[best_idx]
r_opt = (data.F606W + b_opt) * a_opt + c_opt * data.error
# Refinement pass 1: add a d * F814W term.
D = np.arange(-10, 10, 0.1)
BEST_RMSE = []
for d in D:
    recons = r_opt + d * data.F814W
    BEST_RMSE.append(np.sqrt(mean_squared_error(recons, orig)))
BEST_RMSE = np.array(BEST_RMSE)
d_opt = D[BEST_RMSE.argmin()]
r_opt = (data.F606W + b_opt) * a_opt + c_opt * data.error + d_opt * data.F814W
# Refinement pass 2: add an e * error.1 term.
E = np.arange(-10, 10, 1)
BEST_RMSE = []
for e in E:
    recons = r_opt + e * data['error.1']
    BEST_RMSE.append(np.sqrt(mean_squared_error(recons, orig)))
BEST_RMSE = np.array(BEST_RMSE)
e_opt = E[np.array(BEST_RMSE).argmin()]
r_opt_first = r_opt + e_opt * data['error.1']
first_RMSE = np.array(BEST_RMSE).min() / max_pca
# Reconstruction (red) overplotted on the sign-flipped first component (black).
plt.plot(-pca_data.FirstComponent,data.F814W,',',color='k')
plt.plot(r_opt,data.F814W,',',color='red')
plt.grid(True)
#plt.plot(np.array(data.Chi),np.array(data.F814W-data.F606W),',')
# Hand-tuned linear map of the second component compared against Chi.
plt.plot(np.array(data.Chi),pca_data['FirstComponent'],',',color='red')
plt.plot(pca_data.SecondComponent*0.96+2.276,pca_data['FirstComponent'],',',color='black')
r_opt_sec=pca_data.SecondComponent*0.96+2.276
second_RMSE=np.sqrt(mean_squared_error(pca_data.SecondComponent*0.96+2.276,data.Chi))/data.Chi.max()
# Hand-tuned linear map of the third component against the stellar colour.
plt.ylim(30.0,12.5)
plt.plot(np.array(data['F814W']-data['F606W']),data.F814W,',',color='red')
plt.plot(-1.24+1.31*pca_data.ThirdComponent,data.F814W,',',color='k')
r_opt_third=-1.24+1.31*pca_data.ThirdComponent
third_RMSE=np.sqrt(mean_squared_error(-1.24+1.31*pca_data.ThirdComponent,np.array(data.F814W-data.F606W)))
third_RMSE=third_RMSE/np.array(data['F814W']-data['F606W']).max()
# Augment the PCA table with position and the Sharp target.
pca_data['X']=data.X
pca_data['Y']=data.Y
pca_data['Sharp']=data.Sharp
pca_data.head()
#pca_data=pca_data.drop(columns=['Sharp'])
# Mutual information of the augmented PCA table (components + X, Y)
# against Sharp; the Sharp column itself (last) is excluded.
PCA_COL_NAMES = pca_data.columns.tolist()
MI = []
for col in PCA_COL_NAMES[0:-1]:
    MI.append(mi(np.array(pca_data[col].tolist()).reshape(-1, 1), np.array(data.Sharp)))
    print('Mutual information computed between ' + str(col) + ' and Sharp')
NEW_MI = [row[0] for row in MI]
MI_data = pd.DataFrame({'Column': PCA_COL_NAMES[0:-1], 'Mutual Information': NEW_MI})
MI_data.sort_values(by='Mutual Information', ascending=False)
# Binary target: the sign of Sharp, with exact zeros mapped to +1.
data['SharpSign'] = data.Sharp.apply(np.sign)
data['SharpSign'] = data['SharpSign'].replace(0, 1)
pca_data['Target'] = data['SharpSign']
pca_data.head()
from sklearn.svm import LinearSVC as SVC
from sklearn.model_selection import train_test_split
# Features: PCA components plus X, Y (Sharp and the target are held out).
X=pca_data.drop(columns=['Sharp','Target'])
y=pca_data.Target
# 80/20 train/test split, then a further 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
X_traint,X_train_val,y_traint,y_train_val= train_test_split(
X_train, y_train, test_size=0.2, random_state=42)
# Coarse-to-fine search of the LinearSVC regularisation parameter C,
# scored on the validation split, over three ranges.
# NOTE(review): MIN_VAL/MIN_C actually hold the *best* (max) score and C.
C_SCORE = []
c = np.arange(0.2, 1.2, 0.2)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(1, 11, 1)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(10, 110, 10)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
# Refit on the full training split with the best C from each range and
# evaluate on the held-out test set.
OPT_C = [0.6, 7, 30]
FIN_SCORE = []
for opt_C in OPT_C:
    # BUG FIX: the original used the undefined name `opt_c` here (NameError).
    clf = SVC(C=opt_C)
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
print('The PCA dataset gave a best classification with ' + str(fin_score * 100) + '% of accuracy with a linear classifier')
# Repeat the C search on the PCA feature matrix.
# NOTE(review): the first assignment below is dead -- it is immediately
# overwritten by the PCA feature matrix on the next line; kept as-is.
X = data.drop(columns=['SharpSign'])
X = pca_data.drop(columns=['Sharp', 'Target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_traint, X_train_val, y_traint, y_train_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
C_SCORE = []
c = np.arange(0.2, 1.2, 0.2)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(1, 11, 1)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(10, 110, 10)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
OPT_C = [1, 70]
FIN_SCORE = []
for opt_C in OPT_C:
    # BUG FIX: the original used the undefined name `opt_c` here (NameError).
    clf = SVC(C=opt_C)
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
print('The PCA dataset gave a best classification with ' + str(fin_score * 100) + '% of accuracy with a linear classifier')
# Raw-feature variant: drop identifiers, positions, fit statistics and F606W.
opt_data = data.drop(columns=['#ID', 'X', 'Y', 'Chi', 'Sharp', 'F606W'])
opt_data.head()
X = opt_data.drop(columns=['SharpSign'])
# Binary target: sign of Sharp with exact zeros mapped to +1.
data['SharpSign'] = data.Sharp.apply(np.sign)
data['SharpSign'] = data['SharpSign'].replace(0, 1)
y = data.SharpSign
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_traint, X_train_val, y_traint, y_train_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
C_SCORE = []
c = np.arange(0.2, 1.2, 0.2)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(1, 11, 1)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(10, 110, 10)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
OPT_C = [0.2, 1, 40]
FIN_SCORE = []
for opt_C in OPT_C:
    # BUG FIX: the original used the undefined name `opt_c` here (NameError).
    clf = SVC(C=opt_C)
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
print('The PCA dataset gave a best classification with ' + str(fin_score * 100) + '% of accuracy with a linear classifier')
# NOTE(review): opt_data is redefined here but X is *not* updated, so the
# splits below still use the previous feature matrix -- kept as-is; confirm
# whether X = opt_data was intended.
opt_data = pca_data[['FirstComponent', 'SecondComponent', 'ThirdComponent']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
X_traint, X_train_val, y_traint, y_train_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)
C_SCORE = []
c = np.arange(0.2, 1.2, 0.2)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(1, 11, 1)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
C_SCORE = []
c = np.arange(10, 110, 10)
for c_value in c:
    clf = SVC(C=c_value)
    clf.fit(X_traint, y_traint)
    sc = clf.score(X_train_val, y_train_val)
    C_SCORE.append(sc)
    print(str(c_value) + ' coefficient has been adopted')
MIN_VAL = np.array(C_SCORE).max()
MIN_C = c[np.array(C_SCORE).argmax()]
print('In the range ' + str(c.min()) + ' and ' + str(c.max()) + '\n')
print('the best score has been obtained with ' + str(MIN_C))
print('and it is ' + str(MIN_VAL))
OPT_C = [0.2, 1, 70]
FIN_SCORE = []
for opt_C in OPT_C:
    # BUG FIX: the original used the undefined name `opt_c` here (NameError).
    clf = SVC(C=opt_C)
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
print('The PCA dataset gave a best classification with ' + str(fin_score * 100) + '% of accuracy with a linear classifier')
# Two-component feature set for visualisation and an RBF-SVM run.
# BUG FIX: .copy() avoids pandas' SettingWithCopyWarning when the 'Target'
# column is added to a sliced frame.
opt_data = pca_data[['FirstComponent', 'SecondComponent']].copy()
opt_data['Target'] = data.SharpSign
opt_data
X = opt_data.drop(columns=['Target'])
# BUG FIX: seaborn >= 0.12 removed positional x/y arguments.
sns.scatterplot(x=opt_data.FirstComponent, y=opt_data.SecondComponent, hue=opt_data.Target, palette='plasma')
plt.grid(True)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
y = opt_data.Target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# RBF-kernel SVM: scan C in [0.1, 1.0] (step 0.1), scored on the test split.
c_list = np.arange(0.10, 1.1, 0.10)
k = 0
FIN_SCORE = []
# Progress messages keyed by the iteration counter (10 values per scan).
_PROGRESS = {1: '20% of the C values inspected',
             4: '50% of the C values inspected',
             7: '80% of the C values inspected'}
for c in c_list:
    clf = SVC(C=c, kernel='rbf')
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
    if k in _PROGRESS:
        print(_PROGRESS[k])
    if k == 9:
        print('100% of the C values inspected \n')
        print('Process completed')
    k = k + 1
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
i = FIN_SCORE.argmax()
c_max = c_list[i]
print('The best classification is done between ' + str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) + ', obtaining the following accuracy: ' + str(fin_score * 100) + '%')
# Same scan for C in [1, 10].
c_list = np.arange(1., 11, 1.)
k = 0
FIN_SCORE = []
for c in c_list:
    clf = SVC(C=c, kernel='rbf')
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
    if k in _PROGRESS:
        print(_PROGRESS[k])
    if k == 9:
        print('100% of the C values inspected \n')
        print('Process completed')
    k = k + 1
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
i = FIN_SCORE.argmax()
c_max = c_list[i]
print('The best classification is done between ' + str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) + ', obtaining the following accuracy: ' + str(fin_score * 100) + '%')
# Coarser scans: C in [10, 50] then [100, 500] (five values each).
c_list = np.arange(10, 60, 10)
k = 0
FIN_SCORE = []
for c in c_list:
    clf = SVC(C=c, kernel='rbf')
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
    # One progress line per value: 20%, 40%, ... 100%.
    print(str((k + 1) * 20) + '% of the C values inspected')
    k = k + 1
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
i = FIN_SCORE.argmax()
c_max = c_list[i]
print('The best classification is done between ' + str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) + ', obtaining the following accuracy: ' + str(fin_score * 100) + '%')
c_list = np.arange(100, 600, 100)
k = 0
FIN_SCORE = []
for c in c_list:
    clf = SVC(C=c, kernel='rbf')
    clf.fit(X_train, y_train)
    fin_score = clf.score(X_test, y_test)
    FIN_SCORE.append(fin_score)
    print(str((k + 1) * 20) + '% of the C values inspected \n')
    if k == 4:
        print('Process completed')
    k = k + 1
FIN_SCORE = np.array(FIN_SCORE)
fin_score = FIN_SCORE.max()
i = FIN_SCORE.argmax()
c_max = c_list[i]
print('The best classification is done between ' + str(c_list.min()) + ' and ' + str(c_list.max()) + '\n')
print('is ' + str(c_max) + ', obtaining the following accuracy: ' + str(fin_score * 100) + '%')
# Push C far beyond the scanned ranges to probe saturation of the score.
clf=SVC(C=1000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
print('The score for C=1000 is ' +str(fin_score*100) +'%')
clf=SVC(C=10000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
print('The score for C=10000 is ' +str(fin_score*100) +'%')
clf=SVC(C=100000, kernel='rbf')
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
print('The score for C=100000 is ' +str(fin_score*100) +'% \n')
print ('SVM best score: 72.2%')
# Collect test-set truths and predictions (from the C=100000 fit above)
# and persist them for later inspection.
pred=clf.predict(X_test)
pred_data=pd.DataFrame()
pred_data['FirstComponent']= X_test['FirstComponent']
pred_data['SecondComponent']=X_test['SecondComponent']
pred_data['Target']=y_test
pred_data['Prediction']=pred
pred_data.head()
pred_data.to_csv('SVMprediction.csv')
# Truth (top) vs prediction (bottom) in the PCA plane.
# NOTE(review): seaborn >= 0.12 requires keyword x=/y= arguments here.
plt.subplot(2,1,1)
sns.scatterplot(pred_data.FirstComponent,pred_data.SecondComponent,hue=pred_data.Target,palette='plasma')
plt.grid(True)
plt.xlabel('First Component',fontsize=20)
plt.ylabel('Second Component',fontsize=20)
plt.subplot(2,1,2)
sns.scatterplot(pred_data.FirstComponent,pred_data.SecondComponent,hue=pred_data.Prediction,palette='plasma')
plt.grid(True)
plt.xlabel('First Component',fontsize=20)
plt.ylabel('Second Component',fontsize=20)
X_test
import matplotlib.cm as cm
# Decision-function heat map of the final RBF SVM over the
# (FirstComponent, SecondComponent) plane, with the zero-level contour.
xx, yy = np.meshgrid(np.linspace(-15, 20, 500),
                     np.linspace(-15, 25, 500))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
fig = plt.figure(figsize=(16, 8))
fig.patch.set_facecolor('white')
ax = fig.gca()
imshow_handle = plt.imshow(Z, interpolation='nearest',
                           extent=(xx.min(), xx.max(), yy.min(), yy.max()), aspect='auto',
                           origin='lower', alpha=.5, cmap='plasma')
# BUG FIX: the keyword is `linestyles`; the original passed the invalid
# `linetypes`, which recent matplotlib rejects.
contours = plt.contour(xx, yy, Z, levels=[0], linewidths=2,
                       linestyles='--', colors='red')
# BUG FIX: seaborn >= 0.12 removed positional x/y arguments.
sns.scatterplot(x=pred_data.FirstComponent, y=pred_data.SecondComponent, hue=pred_data.Target, palette='plasma')
plt.xlabel('$x_1$', fontsize=14)
plt.ylabel('$x_2$', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
#plt.xlim(-3, 3)
#plt.ylim(-3, 3)
plt.legend()
plt.show()
# Three-component feature set (binary target).
# BUG FIX: .copy() avoids pandas' SettingWithCopyWarning when adding 'Target'.
opt_data = pca_data[['FirstComponent', 'SecondComponent', 'ThirdComponent']].copy()
opt_data['Target'] = data.SharpSign
y = opt_data.Target
import plotly.express as px
#df = px.data.iris()
fig = px.scatter_3d(opt_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
                    color='Target')
fig.update_traces(marker=dict(size=2))
fig.show()
X = opt_data.drop(columns=['Target'])
y = opt_data.Target
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# 10% train / 90% test, then split the small train half into train/val.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
# Pick the best SVC kernel across five resampling rounds.
# NOTE(review): random_state=42 makes all five rounds use identical splits.
K_LIST = ['linear', 'poly', 'rbf', 'sigmoid']
BEST_KERNEL = []
k = 0
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    for ker in K_LIST:
        clf = SVC(kernel=ker)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        k = k + 1
        print(ker + ' Kernel has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
CV_DATA = pd.DataFrame({'CV Number': np.arange(1, 6, 1), 'Choosen Kernel': BEST_KERNEL})
sns.countplot(CV_DATA['Choosen Kernel'])
best_kernel = 'rbf'
len(c_list)
# Cross-validated scan of C for the chosen RBF kernel (5 rounds, 100 values).
c_list = np.arange(0.5, 50.5, 0.5)
k = 0
PERC = ['20%', '40%', '60%', '80%', '100%']
K = [20, 40, 60, 80, 100]
BEST_C = []
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    # BUG FIX: reset the progress counter each round. The original only
    # initialised it once, so progress printed during the first round only;
    # the later copies of this loop in this file do reset it per round.
    k = 0
    for c in c_list:
        k = k + 1
        clf = SVC(C=c, kernel=best_kernel)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        if k in K:
            ind = K.index(k)
            print(PERC[ind] + ' of the C values has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
# Best C from the last round's scores.
FIN_SCORE = np.array(FIN_SCORE)
best_c = c_list[FIN_SCORE.argmax()]
# Final fit on a 30/70 split with the selected kernel and C.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, random_state=42)
clf = SVC(kernel=best_kernel, C=best_c)
clf.fit(X_train, y_train)
fin_score = clf.score(X_test, y_test)
# BUG FIX: X_test holds only the three PCA feature columns -- dropping a
# non-existent 'Target' column raised a KeyError in the original.
prediction = clf.predict(X_test)
print('The final score with 3 feature is ' + str(fin_score * 100) + '% ')
test_data = X_test.copy()
test_data['Target'] = y_test
test_data['Prediction'] = prediction
test_data
# 3-D views of the predictions: component space, then Target vs prediction.
import plotly.express as px
#fig=go.figure()
#plt.subplot(1,2,1)
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
                    color='Prediction')
fig.update_traces(marker=dict(size=2))
fig.show()
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='Target',
                    color='Prediction')
fig.update_traces(marker=dict(size=2))
fig.show()
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix
# Binary confusion matrix of the final prediction (rows = true class,
# columns = predicted class), rendered as a heat map.
y_test=test_data.Target
predic = prediction
columns = ['Negative','Non Negative']
confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)
ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
def precision(confusion):
    """Per-class precision for a 2x2 confusion matrix.

    `confusion` has true labels on rows and predictions on columns.
    Precision of a class = diagonal entry / column sum.
    The original labelled the off-diagonal terms FP/FN the wrong way
    round (the returned values were already correct); names fixed here.
    """
    TP = confusion[0][0]     # true class 0, predicted 0
    TN = confusion[1][1]     # true class 1, predicted 1
    FN = confusion[0][1]     # true class 0, predicted 1
    FP = confusion[1][0]     # true class 1, predicted 0
    pres_a = TP / (TP + FP)  # precision of class 0 (column 0)
    pres_b = TN / (TN + FN)  # precision of class 1 (column 1)
    return [pres_a, pres_b]
def recall(confusion):
    """Per-class recall for a 2x2 confusion matrix.

    Recall of a class = diagonal entry / row sum.  As with precision()
    above, the original's FP/FN local names were swapped while the
    returned values were correct; names fixed here.
    """
    TP = confusion[0][0]    # true class 0, predicted 0
    TN = confusion[1][1]    # true class 1, predicted 1
    FN = confusion[0][1]    # true class 0, predicted 1
    FP = confusion[1][0]    # true class 1, predicted 0
    rec_a = TP / (TP + FN)  # recall of class 0 (row 0)
    rec_b = TN / (TN + FP)  # recall of class 1 (row 1)
    return [rec_a, rec_b]
def statistics(confusion):
    """Tabulate per-class precision (row 0) and recall (row 1)."""
    p = precision(confusion)
    r = recall(confusion)
    stat = pd.DataFrame({'Negative': [p[0], r[0]],
                         'Non Negative': [p[1], r[1]]})
    stat.index = ['Precision', 'Recall']
    return stat
statistics(confm)
# Three-class experiment: keep the raw sign of Sharp (-1, 0, +1) as target.
# BUG FIX: .copy() avoids pandas' SettingWithCopyWarning when adding 'Target'.
opt_data = pca_data[['FirstComponent', 'SecondComponent', 'ThirdComponent']].copy()
data['SharpSign'] = data.Sharp.apply(np.sign)
opt_data['Target'] = data['SharpSign']
y = opt_data.Target
import plotly.express as px
#df = px.data.iris()
fig = px.scatter_3d(opt_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
                    color='Target')
fig.update_traces(marker=dict(size=2))
fig.show()
X = opt_data.drop(columns=['Target'])
y = opt_data.Target
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
# Kernel selection for the 3-class problem (five rounds; identical splits
# because random_state is fixed -- kept as in the original).
K_LIST = ['linear', 'poly', 'rbf', 'sigmoid']
BEST_KERNEL = []
k = 0
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k = 0
    for ker in K_LIST:
        clf = SVC(kernel=ker)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        k = k + 1
        print(ker + ' Kernel has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
CV_DATA = pd.DataFrame({'CV Number': np.arange(1, 6, 1), 'Choosen Kernel': BEST_KERNEL})
sns.countplot(CV_DATA['Choosen Kernel'])
best_kernel = 'rbf'
# Cross-validated C scan for the 3-class RBF SVM (progress counter is
# reset every round in this version).
c_list = np.arange(0.5, 50.5, 0.5)
k = 0
PERC = ['20%', '40%', '60%', '80%', '100%']
K = [20, 40, 60, 80, 100]
BEST_C = []
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k = 0
    for c in c_list:
        k = k + 1
        clf = SVC(C=c, kernel=best_kernel)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        #k=k+1
        if k in K:
            ind = K.index(k)
            print(PERC[ind] + ' of the C values has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
FIN_SCORE = np.array(FIN_SCORE)
best_c = c_list[FIN_SCORE.argmax()]
# Hard-coded override chosen from inspecting the CV histogram above.
best_c = 17
best_kernel = 'rbf'
# Final 3-class fit on a 30/70 split using the hard-coded best C/kernel.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.7, random_state=42)
clf=SVC(kernel=best_kernel,C=best_c)
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
prediction=clf.predict(X_test)
print('The final score with 3 feature is ' + str(fin_score*100) +'% ')
test_data=X_test.copy()
test_data['Target']=y_test
test_data['Prediction']=prediction
test_data
# 3-D views of the predictions: component space, then Target vs prediction.
import plotly.express as px
#fig=go.figure()
#plt.subplot(1,2,1)
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='ThirdComponent',
color='Prediction')
fig.update_traces(marker=dict(size=2))
fig.show()
fig = px.scatter_3d(test_data, x='FirstComponent', y='SecondComponent', z='Target',
color='Prediction')
fig.update_traces(marker=dict(size=2))
fig.show()
# 3x3 confusion matrix for the three Sharp-sign classes (-1, 0, +1).
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix
y_test=test_data.Target
predic = prediction
columns = ['Negative','Zero','Positive']
confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)
ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
def precision(confusion, clas):
    """Precision of one class in the 3-class confusion matrix.

    Rows of `confusion` are true labels and columns are predictions, in
    the order Negative (-1), Zero (0), Positive (+1).  Precision is the
    diagonal entry divided by its *column* sum, TP / (TP + FP).

    BUG FIX: the original summed the *row* for 'Positive' and 'Zero'
    (which computes recall, not precision); only 'Negative' was correct.
    """
    if clas == 'Negative':
        TP = confusion[0][0]
        FP = confusion[1][0] + confusion[2][0]
    if clas == 'Positive':
        TP = confusion[2][2]
        FP = confusion[0][2] + confusion[1][2]
    if clas == 'Zero':
        TP = confusion[1][1]
        FP = confusion[0][1] + confusion[2][1]
    return TP / (TP + FP)
def recal(confusion, clas):
    """Recall of one class: diagonal entry over its *row* sum, TP / (TP + FN).

    BUG FIX: the original summed the *column* for 'Positive' and 'Zero'
    (which computes precision, not recall); only 'Negative' was correct.
    """
    if clas == 'Negative':
        TP = confusion[0][0]
        FN = confusion[0][1] + confusion[0][2]
    if clas == 'Positive':
        TP = confusion[2][2]
        FN = confusion[2][0] + confusion[2][1]
    if clas == 'Zero':
        TP = confusion[1][1]
        FN = confusion[1][0] + confusion[1][2]
    return TP / (TP + FN)
# Per-class precision and recall of the 3-class fit (bare expression
# statements: display in a notebook, values discarded when run as a script).
precision(confm,'Negative'),precision(confm,'Positive'),precision(confm,'Zero')
recal(confm,'Negative'),recal(confm,'Positive'),recal(confm,'Zero')
# NOTE(review): stale binary-case recall() kept for fidelity; the 3-class
# metrics above (precision/recal) are what this section actually uses.
def recall(confusion):
    TP = confusion[0][0]
    TN = confusion[1][1]
    FP = confusion[0][1]
    FN = confusion[1][0]
    rec_a = TP / (TP + FP)
    rec_b = TN / (TN + FN)
    return [rec_a, rec_b]
# BUG FIX: the original line was missing its closing bracket (syntax
# error).  The value is unused -- statistics() recomputes it below.
zero = [precision(confm, 'Zero'), recal(confm, 'Zero')]
def statistics(confusion):
    """Precision/recall table for the 3-class confusion matrix.

    BUG FIX: the original ignored its `confusion` parameter and read the
    global `confm` instead.
    """
    neg = [precision(confusion, 'Negative'), recal(confusion, 'Negative')]
    pos = [precision(confusion, 'Positive'), recal(confusion, 'Positive')]
    zero = [precision(confusion, 'Zero'), recal(confusion, 'Zero')]
    stats = pd.DataFrame({'Negative': neg, 'Positive': pos, 'Zero': zero})
    stats.index = ['Precision', 'Recall']
    return stats
statistics(confm)
# Two-feature, three-class variant.
# BUG FIX: .copy() avoids pandas' SettingWithCopyWarning when adding 'Target'.
opt_data = pca_data[['FirstComponent', 'SecondComponent']].copy()
data['SharpSign'] = data.Sharp.apply(np.sign)
opt_data['Target'] = data['SharpSign']
# BUG FIX: seaborn >= 0.12 removed positional x/y arguments.
sns.scatterplot(x=opt_data.FirstComponent, y=opt_data.SecondComponent, hue=opt_data.Target, palette='plasma')
plt.grid(True)
X = opt_data.drop(columns=['Target'])
y = opt_data.Target
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42)
# Kernel selection for the 2-feature, 3-class problem (five rounds).
K_LIST = ['linear', 'poly', 'rbf', 'sigmoid']
BEST_KERNEL = []
k = 0
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    for ker in K_LIST:
        clf = SVC(kernel=ker)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        k = k + 1
        print(ker + ' Kernel has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_KERNEL.append(K_LIST[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
sns.countplot(BEST_KERNEL)
best_kernel = 'rbf'
# Cross-validated C scan for the 2-feature, 3-class RBF SVM.
c_list = np.arange(0.5, 50.5, 0.5)
k = 0
PERC = ['20%', '40%', '60%', '80%', '100%']
K = [20, 40, 60, 80, 100]
BEST_C = []
for i in range(5):
    FIN_SCORE = []
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42)
    k = 0
    for c in c_list:
        k = k + 1
        clf = SVC(C=c, kernel=best_kernel)
        clf.fit(X_train, y_train)
        fin_score = clf.score(X_val, y_val)
        FIN_SCORE.append(fin_score)
        #k=k+1
        if k in K:
            ind = K.index(k)
            print(PERC[ind] + ' of the C values has been explored')
    FIN_SCORE = np.array(FIN_SCORE)
    BEST_C.append(c_list[FIN_SCORE.argmax()])
    print('Cross validation ' + str(i) + ' out of 4 \n')
sns.countplot(BEST_C)
plt.xlabel('Chosen C')
plt.ylabel('Count')
FIN_SCORE = np.array(FIN_SCORE)
# Hard-coded choice from the scans above, then the final 30/70 fit.
best_c=14.5
best_kernel='rbf'
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.7, random_state=42)
clf=SVC(kernel=best_kernel,C=best_c)
clf.fit(X_train,y_train)
fin_score=clf.score(X_test,y_test)
prediction=clf.predict(X_test)
print('The final score with 2 feature is ' + str(fin_score*100) +'% ')
# Keep the test features together with truth and prediction for plotting.
pred_data=X_test.copy()
pred_data['Target']=y_test
pred_data['Prediction']=prediction
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
def make_meshgrid(x, y, h=.4):
    """Build a 2-D grid covering the data range, padded by 1 unit per side.

    `x` and `y` are 1-D coordinate arrays; `h` is the grid step.
    Returns the pair (xx, yy) from np.meshgrid.
    """
    xs = np.arange(x.min() - 1, x.max() + 1, h)
    ys = np.arange(y.min() - 1, y.max() + 1, h)
    return np.meshgrid(xs, ys)
def plot_contours(ax, clf, xx, yy, **params):
    """Draw the classifier's decision regions as filled contours on *ax*.

    The grid points (xx, yy) are flattened into an (n, 2) array, classified
    with ``clf.predict``, and the labels are drawn via ``ax.contourf``.
    Extra keyword arguments are forwarded to ``contourf``; its return value
    is passed back to the caller.
    """
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    labels = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, labels, **params)
# --- Decision-surface plot of the trained SVM over the two PCA components ---
# NOTE(review): this rebinds the global `y` to the test labels.
y=y_test
fig, ax = plt.subplots()
# title for the plots
# NOTE(review): `title` is assigned but never used below.
title = ('Decision surface of linear SVC ')
# Set-up grid for plotting.
X0, X1 = X_test['FirstComponent'], X_test['SecondComponent']
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap='plasma', alpha=0.8)
ax.scatter(X0, X1, c=y, cmap='plasma', s=20, edgecolors='k')
ax.set_ylabel('Second Component')
ax.set_xlabel('First Component')
# Manual legend entries approximating the plasma colormap extremes.
violet_patch = mpatches.Patch(color='navy', label='Sharp<0')
yellow_patch = mpatches.Patch(color='gold', label='Sharp>0')
pink_patch = mpatches.Patch(color='magenta', label='Sharp=0')
plt.legend(handles=[violet_patch,yellow_patch,pink_patch])
ax.set_xticks(())
ax.set_yticks(())
ax.set_title('Decision Surface', fontsize=20)
#ax.legend()
plt.show()
# --- Confusion matrix heatmap for the SVM predictions ---
test_data=pred_data
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix
y_test=test_data.Target
predic = prediction
# Class order presumably follows sklearn's sorted labels (-1, 0, 1) --
# TODO confirm SharpSign really takes exactly these three values.
columns = ['Negative','Zero','Positive']
confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)
ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
def precision(confusion, clas):
    """Return the precision of one class from a 3x3 confusion matrix.

    Rows of *confusion* are true labels and columns are predictions, in the
    order Negative(0), Zero(1), Positive(2).

    precision = TP / (TP + FP), where the false positives are the other two
    entries of the predicted-class COLUMN.

    Bug fix: the original used the column sum only for 'Negative' but used
    ROW sums (i.e. recall) for 'Positive' and 'Zero'; all three classes now
    consistently use the column sum.
    """
    idx = {'Negative': 0, 'Zero': 1, 'Positive': 2}[clas]
    TP = confusion[idx][idx]
    # Other true-label rows of the same predicted column = false positives.
    FP = sum(confusion[r][idx] for r in range(3) if r != idx)
    return TP / (TP + FP)
def recal(confusion, clas):
    """Return the recall of one class from a 3x3 confusion matrix.

    Rows of *confusion* are true labels and columns are predictions, in the
    order Negative(0), Zero(1), Positive(2).

    recall = TP / (TP + FN), where the false negatives are the other two
    entries of the true-class ROW.

    Bug fix: the original used the row sum only for 'Negative' but used
    COLUMN sums (i.e. precision) for 'Positive' and 'Zero'; all three
    classes now consistently use the row sum.
    """
    idx = {'Negative': 0, 'Zero': 1, 'Positive': 2}[clas]
    TP = confusion[idx][idx]
    # Other predicted columns of the same true row = false negatives.
    FN = sum(confusion[idx][c] for c in range(3) if c != idx)
    return TP / (TP + FN)
# Notebook-style display cells: per-class precision and recall tuples.
# NOTE(review): outside a notebook these values are computed and discarded.
precision(confm,'Negative'),precision(confm,'Positive'),precision(confm,'Zero')
recal(confm,'Negative'),recal(confm,'Positive'),recal(confm,'Zero')
def recall(confusion):
    """Row-wise recall of the first two classes of *confusion*.

    Uses only the top-left 2x2 sub-matrix (rows = true labels,
    columns = predictions) and returns the pair
    [c[0][0]/(c[0][0]+c[0][1]), c[1][1]/(c[1][1]+c[1][0])].
    """
    true_a, miss_a = confusion[0][0], confusion[0][1]
    true_b, miss_b = confusion[1][1], confusion[1][0]
    first = true_a / (true_a + miss_a)
    second = true_b / (true_b + miss_b)
    return [first, second]
def statistics(confusion):
    """Build a per-class precision/recall summary table.

    Returns a DataFrame with columns Negative/Positive/Zero and rows
    ['Precision', 'Recall'], computed with the module-level ``precision``
    and ``recal`` helpers.

    Bug fix: the original ignored its *confusion* parameter and always read
    the global ``confm``; it now uses the matrix actually passed in.
    """
    stats = pd.DataFrame({
        'Negative': [precision(confusion, 'Negative'), recal(confusion, 'Negative')],
        'Positive': [precision(confusion, 'Positive'), recal(confusion, 'Positive')],
        'Zero': [precision(confusion, 'Zero'), recal(confusion, 'Zero')],
    })
    stats.index = ['Precision', 'Recall']
    return stats
# Display the summary for the SVM confusion matrix.
statistics(confm)
# Inspect the rows the SVM labelled +1: where do they sit in PCA space?
wrong_data=pred_data[pred_data['Prediction']==1.0].drop(columns=['Target'])
wrong_target=pred_data[pred_data['Prediction']==1.0].Target
sns.scatterplot(wrong_data.FirstComponent,wrong_data.SecondComponent,hue=wrong_target)
# Rebuild a feature matrix from the raw catalogue for the SVM test rows,
# this time keeping all photometric columns instead of the PCA components.
X=data.loc[X_test.index].drop(columns=['#ID','Sharp','SharpSign'])
y=data.loc[X_test.index].SharpSign
# Fresh 80/20 split on the raw features for the random-forest stage.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Create a Gaussian Classifier
rf=RandomForestClassifier(n_estimators=500, min_samples_split = 20, max_features = 5)
# Train the model on training data
rf.fit(X_train, y_train)
# Raw-feature names in the column order kept when X was built above.
feature_names = ['X','Y','F606W','error','F814W','error.1','Chi']
# Creating a bar plot
feature_imp = pd.Series(rf.feature_importances_,index=feature_names).sort_values(ascending=False)
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# NOTE(review): nothing here has a label, so legend() emits a warning.
plt.legend()
plt.show()
# Assemble the forest's predictions next to the targets and PCA components.
Results=X_test.copy()
Results['Target']=y_test
Results['Pred']=rf.predict(X_test)
# NOTE(review): `opt_data` must be defined earlier in the file (not visible
# in this excerpt) -- presumably the PCA-transformed frame; verify.
Results['FirstComponent']=opt_data['FirstComponent'].loc[Results.index]
Results['SecondComponent']=opt_data['SecondComponent'].loc[Results.index]
# Keep the SVM's 0 / -1 predictions; the +1 rows were re-examined by the
# random forest, so the two result sets complement each other.
# .copy() avoids pandas' SettingWithCopyWarning when columns are added below.
good_ones = pred_data[(pred_data.Prediction == 0.) | (pred_data.Prediction == -1.0)].copy()
import plotly.express as px
# Interactive 3D view of the forest's predictions over three raw features.
fig = px.scatter_3d(Results, x='error', y='Y', z='Chi',
                    color='Pred')
fig.update_traces(marker=dict(size=2))
fig.show()
# Attach the raw photometric features to the SVM-kept rows so they line up
# with the random-forest result frame.
for feat in feature_names:
    good_ones[feat] = data[feat].loc[good_ones.index]
good_ones = good_ones.rename(columns={'Prediction': 'Pred'})
# Bug fix: DataFrame.append was deprecated and removed in pandas 2.0;
# pd.concat is the supported equivalent.
Results = pd.concat([Results, good_ones])
from sklearn.metrics import accuracy_score
# Overall accuracy of the combined (ensemble) predictions.
acc = accuracy_score(Results.Pred, Results.Target)
# --- Confusion matrix heatmap for the combined (ensemble) predictions ---
import itertools
from string import ascii_uppercase
from sklearn.metrics import confusion_matrix
# NOTE(review): rebinds the global y_test to the combined frame's targets.
y_test=Results.Target
predic = Results.Pred
columns = ['Negative','Zero','Positive']
confm = confusion_matrix(y_test, predic)
df_cm = pd.DataFrame(confm.astype(float), index=columns, columns=columns)
ax = sns.heatmap(df_cm, cmap='plasma',annot=True,fmt='g')
def statistics(confusion):
    """Per-class precision/recall table (re-definition of the earlier cell).

    Returns a DataFrame with columns Negative/Positive/Zero and rows
    ['Precision', 'Recall'].

    Bug fix: the original ignored its *confusion* parameter and always read
    the global ``confm``; it now uses the matrix actually passed in.
    """
    stats = pd.DataFrame({
        'Negative': [precision(confusion, 'Negative'), recal(confusion, 'Negative')],
        'Positive': [precision(confusion, 'Positive'), recal(confusion, 'Positive')],
        'Zero': [precision(confusion, 'Zero'), recal(confusion, 'Zero')],
    })
    stats.index = ['Precision', 'Recall']
    return stats
# Display the summary for the ensemble confusion matrix.
statistics(confm)
# Hand-collected accuracy summary (percent) of the four approaches tried.
Tot_res=pd.DataFrame({'Performance':[71,74,80,82]})
Tot_res.index=['SVM','Decision Tree','Random Forest','Ensemble Learning']
# Notebook display of the summary table.
Tot_res
sns.barplot(x=Tot_res.index,y=Tot_res.Performance,palette='plasma')
plt.xlabel('Method',fontsize=20)
plt.ylabel('Accuracy (%)',fontsize=20)
plt.grid(True)